##############################
# Media Measurement Matters  #
# Replication Code           #
# Pre-Processing of URLS     #
##############################

# This file documents the steps used to pre-process URLs in the web-tracking
# data. To protect respondent privacy, we do not share the raw data containing
# these URLs, because URLs could be used to personally identify individual
# respondents. As a result, the code below cannot be run as provided.
# However, we include it as a record of our data cleaning steps in order to
# be transparent about our approach.

# Load libraries
library(tidyverse)
library(urltools)

# Helper operator: the elementwise negation of %in%, i.e. TRUE for every
# element of `x` that does not appear in `table`
`%notin%` <- function(x, table) {
  !(x %in% table)
}

# Load the full set of site visits from the tab-delimited raw file (first row
# holds the column names) and convert the result to a base data.frame, so that
# single-column bracket indexing returns a plain vector
web <- as.data.frame(
  read_delim(file = "data/CONFIDENTIAL_raw_web_urls.txt",
             delim = "\t", col_names = TRUE)
)

# Lowercase URLs and domains so that later string comparisons are consistent
web <- mutate(web,
              url = tolower(url),
              domain = tolower(domain))

# Prep BMA data ----

# > Recode Yahoo! domains for BMA ----

# Recode the domains associated with Yahoo! to match the naming conventions used
# by BMA in their scoring

# Split visits recorded under the bare yahoo.com domain into the two names used
# for scoring: gma.yahoo.com when the first path segment is "gma", and
# news.yahoo.com for everything else.

# Row positions of the yahoo.com visits. which() drops rows whose domain is NA,
# so those rows are left untouched.
yahoo_rows <- which(web$domain == "yahoo.com")

# Extract the URLs associated with Yahoo domains (e.g., yahoo.com/news, yahoo.com/gma)
yahoo_urls <- web[yahoo_rows, "url"]

# Keep only the first path segment by dropping everything from the first
# slash onwards (e.g., news, gma)
yahoo_path <- sub("\\/.*", "", url_parse(yahoo_urls)$path)

# If URL associated with GMA, code as gma.yahoo.com. Otherwise, code as news.yahoo.com.
# %in% is used instead of == because it never returns NA: a visit whose parsed
# path is missing falls into the "otherwise" case rather than having its domain
# overwritten with NA.
web[yahoo_rows, "domain"] <- ifelse(yahoo_path %in% "gma",
                                    "gma.yahoo.com",
                                    "news.yahoo.com")

# > Identify URLs with "hard news" ----

# Here, we predict whether a URL is likely to correspond to "hard news" using
# keyword matching, based on the path of each URL. We exclude homepage visits
# to Yahoo! News (yahoo.com/news), even though the path specifically includes
# the term "news," in order to be more conservative in our approach.

# Parse every URL and store its path component in a new `path` column
web[["path"]] <- url_parse(web[["url"]])[["path"]]

# Keyword dictionaries (see Appendix L). Each entry is a regular-expression
# fragment that is later matched against the URL path.

# Terms based on Bakshy, Messing, and Adamic (2015)
bma <- c(
  "politi", "usnews", "world", "national", "state", "elect", "vote", "govern",
  "campaign", "war[^a-z]", "polic", "econ", "unemploy", "racis", "energy",
  "abortion", "educa", "healthcare", "immigr"
)

# Terms based on Tyler, Grimmer, and Iyengar (2021)
tgi <- c(
  "trump", "politic", "clinton", "obama", "elect", "hillary", "campaign",
  "vote", "debate", "president", "kaepernick", "tax", "government",
  "washington", "donald", "republican", "gop", "democrat", "voting", "polls",
  "pence", "congress", "candidate", "senate", "comey", "governor", "dnc",
  "weiner", "parties", "endorsement", "ballot", "wikileak", "kaine", "assange",
  "sanders", "breitbart", "abedin", "manafort", "rnc", "bernie", "melania",
  "pelosi", "proposition", "ivanka", "turnout", "chaffetz", "flynn",
  "representative", "barack", "pacs", "superpac", "election-us"
)

# Terms based on Guess (2021)
guess <- c(
  "africa", "aponline", "asia", "business", "economy", "education", "europe",
  "international", "middleeast", "national", "news", "nyregion", "politics",
  "upshot", "world", "opinion", "nytfrontpage", "opiniontoday", "reuters",
  "elections", "washington", "allpolitics", "americas", "asiapcf", "justice",
  "poli-tics", "hannity", "columns", "editorial", "gerrymandering", "govbeat",
  "metro", "wonkblog", "blogs", "us[^a-z]", "capitalbusiness"
)

# Pool the three dictionaries, sort them, and drop terms that appear in more
# than one source
keywords <- unique(sort(c(bma, tgi, guess)))

# Require a non-letter character immediately before each term, so a term only
# matches where it is not preceded by a lowercase letter
keywords <- paste0("[^a-z]", keywords)

# URLs that are just the yahoo.com/news homepage: http and https, with and
# without a trailing slash
yahoo_home <- c(
  "https://www.yahoo.com/news/",
  "https://www.yahoo.com/news",
  "http://www.yahoo.com/news/",
  "http://www.yahoo.com/news"
)

# Flag a URL as "hard news" when its path contains one of the dictionary terms,
# treating Yahoo! News homepage visits as soft news.
#   hard_news:        1 for a keyword match, 0 otherwise (homepage visits and
#                     missing paths are coded 0).
#   hard_news_nohome: same coding, except that homepage visits and missing paths
#                     are NA. This version is used to exclude homepage visits
#                     from the denominator for Figure L1 in Appendix L.

# Single alternation pattern covering every dictionary term
hard_news_pattern <- paste(keywords, collapse = "|")

web <- web %>%
  mutate(
    # Prefix the path with "/" so a term at the very start of the path has the
    # non-letter character the keyword patterns require; missing paths stay NA
    path = ifelse(is.na(path), NA_character_, paste0("/", path)),
    hard_news = case_when(
      url %in% yahoo_home ~ 0, # Homepage visits are never hard news
      str_detect(path, hard_news_pattern) ~ 1,
      TRUE ~ 0 # No match, or path is missing
    ),
    hard_news_nohome = case_when(
      url %in% yahoo_home | is.na(path) ~ NA_real_, # Homepage or no path
      str_detect(path, hard_news_pattern) ~ 1,
      TRUE ~ 0
    )
  )

# Recode domains for Eady et al. scores ----

# Build `eady_domain`, the key used to match visits to the Eady et al. scores:
# the domain in upper case, with the two recoded Yahoo! subdomains rewritten
# into domain/path form
eady_yahoo_map <- c("GMA.YAHOO.COM" = "YAHOO.COM/GMA",
                    "NEWS.YAHOO.COM" = "YAHOO.COM/NEWS")

web <- web %>%
  mutate(eady_domain = toupper(domain),
         # The lookup gives NA for every domain that is not one of the two
         # Yahoo! names; coalesce() then keeps the upper-cased domain as is
         eady_domain = coalesce(unname(eady_yahoo_map[eady_domain]),
                                eady_domain))

# > Deal with Buzzfeed URLs ----

# Eady et al. just report a score for buzzfeednews.com, whereas comScore just
# tags buzzfeed.com, such that not all visits are to this more specific news domain.

# Row positions of visits currently assigned to the Buzzfeed domain
buzz_rows <- which(web$eady_domain == "BUZZFEED.COM")

# Extract the URLs associated with the Buzzfeed domain
buzz_urls <- web[buzz_rows, "url"]

# Keep only the first path segment, i.e. the text before the first slash
# (e.g., news, quizzes)
buzz_path <- sub("\\/.*", "", url_parse(buzz_urls)$path)

# Flag first segments that point to news/buzzfeednews. grepl() returns FALSE
# for a missing path, so such visits stay under BUZZFEED.COM.
# NOTE(review): buzz_path has already had everything from the first slash
# removed, so the "news/" alternative cannot match; "news$" matches any
# segment that ends in "news" - confirm this is the intended rule.
buzz_is_news <- grepl("news/|news$|buzzfeednews", buzz_path)

# Examine how many URLs start with visits to news/buzzfeednews
table(buzz_path[buzz_is_news])

# Code domain as buzzfeednews.com only if the first path segment matches
web[buzz_rows, "eady_domain"] <- ifelse(buzz_is_news,
                                        "BUZZFEEDNEWS.COM",
                                        "BUZZFEED.COM")

# Identify sequential duplicates ----

# A visit is a sequential duplicate (dupe = 1) when the row directly above it
# has the same URL, the same respondent, and the same date; otherwise dupe = 0.
# Rows are only flagged here, not dropped.
# NOTE(review): this compares each row with the previous row in the current
# order; nothing in this script sorts the data, so it presumably arrives
# ordered by respondent and time - confirm.
# NOTE(review): if `timestamp` is a POSIXct, as.Date() uses UTC by default when
# deriving the day - confirm that is the intended day boundary.

web <- web %>%
  mutate(
    date = as.Date(timestamp), # Identify date of site visit
    # The comparison is NA for the first row (no previous row) and wherever a
    # compared value is missing; coalesce() codes those cases as 0
    dupe = coalesce(
      as.numeric(url == lag(url) &
                   resp_id == lag(resp_id) &
                   date == lag(date)),
      0
    )
  )

# Save web data ----

# Drop the raw URL and its path before saving, then write the same table in
# both RDS and CSV form
web_shareable <- select(web, -c(url, path))

write_rds(web_shareable, "data/web_raw.rds")
write_csv(web_shareable, "data/web_raw.csv")
